Install (if needed) and load the packages and libraries used in this analysis.
library(ggplot2)
library(ezids)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
Read the .CSV data files of adult arrests in the DC area from 2016 to 2021.
# Read the yearly arrest CSV files. read.csv() already returns a
# data.frame, so the extra data.frame() wrapper was redundant.
df_2016 <- read.csv("Arrests 2016 Public.csv")
df_2017 <- read.csv("Arrests 2017 Public.csv")
df_2018 <- read.csv("Arrests by Year, 2018.csv")
df_2019 <- read.csv("Arrests by Year, 2019.csv")
df_2020 <- read.csv("Arrests by Year 2020.csv")
df_2021 <- read.csv("2021 Adult Arrests.csv")
# Column names of the 2016 and 2018 files, compared in the table below.
# colnames() already returns a character vector; c() was a no-op.
c16 <- colnames(df_2016)
c18 <- colnames(df_2018)
Note that the column names of df_2016 and df_2017 are not the same as those of the other years. The column names of df_2016 and df_2018 are shown below:
| col # | 2016 | 2018 |
|---|---|---|
| 1 | Arrestee.Type | Arrestee.Type |
| 2 | Arrest.Year | Arrest.Year |
| 3 | Arrest.Date | Arrest.Date |
| 4 | Arrest.Hour | Arrest.Hour |
| 5 | CCN | CCN |
| 6 | Arrest.Number. | Arrest.Number. |
| 7 | Age | Age |
| 8 | Defendant.PSA | Defendant.PSA |
| 9 | Defendant.District | Defendant.District |
| 10 | Defendant.Race | Defendant.Race |
| 11 | Defendant.Ethnicity | Defendant.Ethnicity |
| 12 | Defendant.Sex | Defendant.Sex |
| 13 | Arrest.Category | Arrest.Category |
| 14 | Charge.Description | Charge.Description |
| 15 | Arrest.Location.PSA | Arrest.Location.PSA |
| 16 | Arrest.Location.District | Arrest.Location.District |
| 17 | Arrest.Location.Block.GeoX | Arrest.Block.GEOX |
| 18 | Arrest.Location.Block.GeoY | Arrest.Block.GEOY |
| 19 | Offense.GEOY | Arrest.Latitude |
| 20 | Offense.GEOX | Arrest.Longitude |
| 21 | Offense.PSA | Offense.Location.PSA |
| 22 | Offense.District | Offense.Location.District |
| 23 | Arrest.Latitude | Offense.Block.GEOX |
| 24 | Arrest.Longitude | Offense.Block.GEOY |
| 25 | Offense.Latitude | Offense.Latitude |
| 26 | Offense.Longitude | Offense.Longitude |
You can see that the columns name are same from the first column to
the 14th column in both data. On the other hand, the name and order of
15th and latter columns are a bit different in those data. The latter
columns are about locations, and we are not very interested in the
detail location. Therefore, we will delete the latter columns except for
the 16th and 22nd columns. In addition, we will drop CCN
(col #5) and Arrest.Number. (col #6) because they are IDs
and not useful for our analysis.
After deleting some columns, we will bind data frames by rows.
# Columns to drop: the ID columns (5: CCN, 6: Arrest.Number.) and the
# detailed location columns (all except the 16th and 22nd).
drop_cols <- c(5, 6, 15, 17:21, 23:26)

# Bind 2016-2017, drop the unused columns, and rename the two remaining
# location columns to match the 2018+ naming scheme.
df_16_17 <- rbind(df_2016, df_2017)[, -drop_cols]
names(df_16_17)[13:14] <- c('Arrest.Location.District', 'Offense.Location.District')

# Bind 2018-2021 and drop the same columns (names already match).
df_18_21 <- rbind(df_2018, df_2019, df_2020, df_2021)[, -drop_cols]

# Stack all six years into a single data frame.
DF <- rbind(df_16_17, df_18_21)
colnames(DF)
## [1] "Arrestee.Type" "Arrest.Year"
## [3] "Arrest.Date" "Arrest.Hour"
## [5] "Age" "Defendant.PSA"
## [7] "Defendant.District" "Defendant.Race"
## [9] "Defendant.Ethnicity" "Defendant.Sex"
## [11] "Arrest.Category" "Charge.Description"
## [13] "Arrest.Location.District" "Offense.Location.District"
# Inspect the structure (row count and column types) of the combined data.
str(DF)
## 'data.frame': 152386 obs. of 14 variables:
## $ Arrestee.Type : chr "Adult Arrest" "Adult Arrest" "Adult Arrest" "Adult Arrest" ...
## $ Arrest.Year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ Arrest.Date : chr "2016-01-01" "2016-01-01" "2016-01-01" "2016-01-01" ...
## $ Arrest.Hour : int 0 0 0 0 0 0 1 1 1 1 ...
## $ Age : int 39 27 21 52 33 22 26 27 23 34 ...
## $ Defendant.PSA : chr "Out of State" "Out of State" "Out of State" "603" ...
## $ Defendant.District : chr "Out of State" "Out of State" "Out of State" "6D" ...
## $ Defendant.Race : chr "WHITE" "WHITE" "BLACK" "BLACK" ...
## $ Defendant.Ethnicity : chr "UNKNOWN" "NOT HISPANIC" "NOT HISPANIC" "NOT HISPANIC" ...
## $ Defendant.Sex : chr "MALE" "MALE" "MALE" "MALE" ...
## $ Arrest.Category : chr "Simple Assault" "Simple Assault" "Assault on a Police Officer" "Traffic Violations" ...
## $ Charge.Description : chr "Threats To Do Bodily Harm -misd" "Simple Assault" "Assault On A Police Officer (simple Assault)" "Permit Suspended-oas" ...
## $ Arrest.Location.District : chr "2D" "3D" "3D" "6D" ...
## $ Offense.Location.District: chr "2D" "3D" "3D" "6D" ...
# Summary table of all columns (xkablesummary is an ezids helper).
xkablesummary(DF)
| Arrestee.Type | Arrest.Year | Arrest.Date | Arrest.Hour | Age | Defendant.PSA | Defendant.District | Defendant.Race | Defendant.Ethnicity | Defendant.Sex | Arrest.Category | Charge.Description | Arrest.Location.District | Offense.Location.District | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Length:152386 | Min. :2016 | Length:152386 | Min. : 0.00 | Min. : 18.00 | Length:152386 | Length:152386 | Length:152386 | Length:152386 | Length:152386 | Length:152386 | Length:152386 | Length:152386 | Length:152386 |
| Q1 | Class :character | 1st Qu.:2017 | Class :character | 1st Qu.: 6.00 | 1st Qu.: 25.00 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character |
| Median | Mode :character | Median :2018 | Mode :character | Median :12.00 | Median : 32.00 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character |
| Mean | NA | Mean :2018 | NA | Mean :11.81 | Mean : 35.19 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| Q3 | NA | 3rd Qu.:2019 | NA | 3rd Qu.:18.00 | 3rd Qu.: 43.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| Max | NA | Max. :2021 | NA | Max. :23.00 | Max. :121.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Since we are interested in crimes committed by white males, we will
drop rows where Defendant.Race is not
“WHITE” or Defendant.Sex is not “MALE”.
In addition, Arrest.Category has some different values
for 2021 and other years:
Therefore, we will convert these values in 2021 into the corresponding values used in the other years.
# Restrict the data to white male defendants.
DF_WM <- subset(DF, Defendant.Race == 'WHITE' & Defendant.Sex == 'MALE')
# Collapse the 2021-specific category labels onto the names used in
# earlier years (standard gsub argument order: pattern, replacement, x).
DF_WM$Arrest.Category <- gsub("Release Violations/Fugitive.*",
                              "Release Violations/Fugitive",
                              DF_WM$Arrest.Category)
DF_WM$Arrest.Category <- gsub("Fraud and Financial Crimes.*",
                              "Fraud and Financial Crimes",
                              DF_WM$Arrest.Category)
str(DF_WM)
## 'data.frame': 12283 obs. of 14 variables:
## $ Arrestee.Type : chr "Adult Arrest" "Adult Arrest" "Adult Arrest" "Adult Arrest" ...
## $ Arrest.Year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ Arrest.Date : chr "2016-01-01" "2016-01-01" "2016-01-01" "2016-01-01" ...
## $ Arrest.Hour : int 0 0 1 1 13 2 3 3 6 7 ...
## $ Age : int 39 27 27 26 48 25 21 41 29 22 ...
## $ Defendant.PSA : chr "Out of State" "Out of State" "Out of State" "Out of State" ...
## $ Defendant.District : chr "Out of State" "Out of State" "Out of State" "Out of State" ...
## $ Defendant.Race : chr "WHITE" "WHITE" "WHITE" "WHITE" ...
## $ Defendant.Ethnicity : chr "UNKNOWN" "NOT HISPANIC" "HISPANIC" "NOT HISPANIC" ...
## $ Defendant.Sex : chr "MALE" "MALE" "MALE" "MALE" ...
## $ Arrest.Category : chr "Simple Assault" "Simple Assault" "Driving/Boating While Intoxicated" "Simple Assault" ...
## $ Charge.Description : chr "Threats To Do Bodily Harm -misd" "Simple Assault" "Driving While Intoxicated -2nd Off" "Simple Assault" ...
## $ Arrest.Location.District : chr "2D" "3D" "4D" "5D" ...
## $ Offense.Location.District: chr "2D" "3D" "4D" "5D" ...
# Summary table of the white-male subset.
xkablesummary(DF_WM)
| Arrestee.Type | Arrest.Year | Arrest.Date | Arrest.Hour | Age | Defendant.PSA | Defendant.District | Defendant.Race | Defendant.Ethnicity | Defendant.Sex | Arrest.Category | Charge.Description | Arrest.Location.District | Offense.Location.District | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Length:12283 | Min. :2016 | Length:12283 | Min. : 0.00 | Min. : 18.00 | Length:12283 | Length:12283 | Length:12283 | Length:12283 | Length:12283 | Length:12283 | Length:12283 | Length:12283 | Length:12283 |
| Q1 | Class :character | 1st Qu.:2017 | Class :character | 1st Qu.: 4.00 | 1st Qu.: 25.00 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character |
| Median | Mode :character | Median :2018 | Mode :character | Median :11.00 | Median : 32.00 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character |
| Mean | NA | Mean :2018 | NA | Mean :11.17 | Mean : 34.58 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| Q3 | NA | 3rd Qu.:2019 | NA | 3rd Qu.:18.00 | 3rd Qu.: 42.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| Max | NA | Max. :2021 | NA | Max. :23.00 | Max. :121.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
The number of crimes is as follows. Crime occurrences seem to have decreased after COVID-19.
| year | the number of crimes |
|---|---|
| 2016 | 2620 |
| 2017 | 2636 |
| 2018 | 2298 |
| 2019 | 2193 |
| 2020 | 1426 |
| 2021 | 1110 |
| 2016 - 2019 (before COVID-19) | 2437 |
| 2020 - 2021 (after COVID-19) | 1268 |
We will create some bar plots to see the number of occurrences per
type of crime.
The Bar plot of crimes in 2016 - 2021 is as follows:
# Figure 1: bar chart of all arrest categories (2016-2021),
# bars ordered by descending frequency via fct_infreq.
ggplot(DF_WM, aes(forcats::fct_infreq(Arrest.Category))) +
  geom_bar() +
  ggtitle("Figure 1: Bar plot of crimes in 2016 - 2021") +
  xlab("crime types") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
The Bar plots of crimes before COVID-19 (2016 - 2019) and after COVID-19 (2020 - 2021) are as follows:
# Figures 2-3: arrest-category bar charts before (2016-2019) and after
# (2020-2021) COVID-19, bars ordered by descending count.
# Fixed a typo in the displayed Figure 2 title: "befor" -> "before".
ggplot(subset(DF_WM, Arrest.Year <= 2019), aes(forcats::fct_infreq(Arrest.Category))) +
  ggtitle("Figure 2: Bar plot of crimes before COVID-19") + xlab("crime types") + geom_bar() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggplot(subset(DF_WM, Arrest.Year > 2019), aes(forcats::fct_infreq(Arrest.Category))) +
  ggtitle("Figure 3: Bar plot of crimes after COVID-19") + xlab("crime types") + geom_bar() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
The Bar plots of crimes in each year are as follows:
# Helper: bar chart of arrest categories for a single year, ordered by
# descending count. Returns a ggplot object, which auto-prints when
# called at top level. Replaces six copy-pasted plot chunks.
year_bar_plot <- function(yr, fig_no) {
  ggplot(subset(DF_WM, Arrest.Year == yr), aes(forcats::fct_infreq(Arrest.Category))) +
    ggtitle(paste0("Figure ", fig_no, ": Bar plot of crimes in ", yr)) +
    xlab("crime types") + geom_bar() +
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
}
year_bar_plot(2016, 4)
year_bar_plot(2017, 5)
year_bar_plot(2018, 6)
year_bar_plot(2019, 7)
year_bar_plot(2020, 8)
year_bar_plot(2021, 9)
“Offenses Against Family & Children” has been increasing since COVID-19.
In order to check changes in crime trends, the graph below illustrates, for each crime type, the change over time in the percentage of crimes of that crime type in all crimes.
# Contingency table: number of arrests per year (rows) x category (columns).
table_freq <- xtabs(~Arrest.Year+Arrest.Category,DF_WM)
# Total arrests per year: rowSums replaces six manual sum(table_freq[i,])
# calls, and stays correct if the number of years changes.
sum_lst <- rowSums(table_freq)
# Within-year share of each category. sum_lst recycles down the first
# dimension (years), so each row is divided by its own yearly total.
table_rfreq <- table_freq/sum_lst
# Long-format data frame (Arrest.Year, Arrest.Category, Freq) for ggplot.
DF_rfreq <- data.frame(table_rfreq)
# Arrest.Year comes out of the table as a factor; convert via character
# to recover the year (as.numeric on a factor would yield level codes).
DF_rfreq$Arrest.Year <- as.character(DF_rfreq$Arrest.Year) %>% as.numeric()
# Figure 10: per-category share of yearly arrests over time.
ggplot(DF_rfreq, aes(x=Arrest.Year, y=Freq, color=Arrest.Category, group=Arrest.Category)) +
  ggtitle("Figure 10: Line plot of percentage of a crime vs year") + xlab("Year") +
  ylab("Percentage of a crime relative to all crimes") +
  geom_line() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1),
        legend.text = element_text(size=5),
        legend.key.size = unit(0.2, 'cm'))
#install.packages("ggrepel")
library(ggrepel)
# Arrest categories ranked by their 2021 share, largest first.
# (Kept in `tmp` because later chunks index into it.)
freq_2021 <- DF_rfreq[DF_rfreq$Arrest.Year == 2021, ]
tmp <- as.character(freq_2021$Arrest.Category[order(freq_2021$Freq, decreasing = TRUE)])
# Keep only the ten categories with the largest 2021 share.
DF_rfreq_10 <- subset(DF_rfreq, Arrest.Category %in% tmp[1:10])
# Figure 11: trend lines for the top-10 categories, labelled at 2021
# with ggrepel instead of a legend.
ggplot(DF_rfreq_10, aes(x = Arrest.Year, y = Freq, color = Arrest.Category, group = Arrest.Category)) +
  geom_line() +
  ggtitle("Figure 11: Line plot of percentage of a crime vs year for top 10 crimes in 2021") +
  xlab("Year") +
  ylab("Percentage of a crime relative to all crimes") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "none") +
  geom_text_repel(
    data = subset(DF_rfreq_10, Arrest.Year == 2021),
    aes(label = Arrest.Category),
    nudge_x = 1,
    nudge_y = 0.08,
    segment.alpha = 0.3,
    size = 2
  ) +
  lims(x = c(2016, 2023))
# Figure 12: the remaining (non-top-10) categories, on a log y scale so
# the small shares are distinguishable. Labels placed with ggrepel.
DF_rfreq_non10 <- subset(DF_rfreq, Arrest.Category %in% tmp[-(1:10)])
ggplot(DF_rfreq_non10, aes(x = Arrest.Year, y = Freq, color = Arrest.Category, group = Arrest.Category)) +
  geom_line() +
  scale_y_log10() +
  ggtitle("Figure 12: Line plot of percentage of a crime vs year for other crimes in Fig.11") + xlab("Year") +
  ylab("Percentage of a crime relative to all crimes") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "none") +
  geom_text_repel(
    data = subset(DF_rfreq_non10, Arrest.Year == 2021),
    aes(label = Arrest.Category),
    nudge_x = 30,
    nudge_y = 0.1,
    segment.alpha = 0.3,
    size = 2
  ) +
  lims(x = c(2016, 2022))
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
“Simple Assault” and “Offenses Against Family & Children” seem to be increasing after COVID-19. On the other hand, “Traffic Violations” seems to be decreasing after COVID-19.
Since crime is likely to be a rare event, the number of occurrences per day of a given crime is expected to follow Poisson distribution. Poisson distribution is a distribution used to describe the distribution of the number of rare phenomena when a large number of them are observed. If a distribution follows Poisson distribution, and the average number of occurrences of the phenomenon is \(\lambda\), the probability that the phenomenon will occur \(x\) times is given by \[p(x) = \exp(-\lambda)\frac{\lambda^{x}}{x!}.\] In the following, we will estimate \(\lambda\) of each crime before and after COVID-19 to see there is a difference in crime trend.
The trend of “Offenses Against Family & Children,” Domestic Violence (DV), appears to have changed after COVID-19. The frequency table of DV before COVID-19 is as follows.
# White-male arrests before COVID-19 (2016-2019).
DF_WM_16_19 <- DF_WM[DF_WM$Arrest.Year %in% 2016:2019, ]
# Domestic-violence (DV) arrests in that period.
DF_WM_16_19_DV <- DF_WM_16_19[DF_WM_16_19$Arrest.Category=='Offenses Against Family & Children', ]
# Number of DV arrests on each date; only dates with at least one
# arrest appear (type-stable vapply instead of sapply).
DV_day_16_19 <- vapply(unique(DF_WM_16_19_DV$Arrest.Date),
                       function(d) sum(DF_WM_16_19_DV$Arrest.Date == d),
                       integer(1))
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 1364 | 0.9336071 |
| 1 | 95 | 0.065024 |
| 2 | 2 | 0.0013689 |
| 3 | 0 | 0 |
We can calculate \(\lambda\) from the above table and \(\lambda = 0.0678\). We will plot the histogram and Poisson distribution with \(\lambda = 0.0678\) to check if they match or not.
# Observed counts of days with 0-5 DV arrests in 2016-2019
# (1461 days total: 365*4 plus one leap day).
x_DV <- 0:5
y_DV <- c(1364, 95, 2, 0, 0, 0)
# Poisson pmf with lambda = 99 arrests / 1461 days (~0.0678).
fx <- dpois(x = x_DV, lambda = 99 / (365 * 4 + 1))
data_DV <- data.frame(x_DV, y_DV, fx)
# Figure 13: raw frequency histogram.
ggplot(data_DV, aes(x = x_DV, y = y_DV)) +
  geom_bar(stat = "identity") +
  ggtitle("Figure 13: Histogram of DV in 2016 - 2019") +
  xlab("Number of occurrences per day") +
  ylab("Frequency")
# Figure 14: relative frequencies with the fitted Poisson pmf overlaid.
ggplot(data_DV) +
  geom_bar(aes(x = x_DV, y = y_DV / (365 * 4 + 1)), stat = "identity") +
  geom_line(aes(x = x_DV, y = fx), color = 'red') +
  geom_point(aes(x = x_DV, y = fx), color = 'red') +
  ggtitle("Figure 14: Relative frequency histogram of DV in 2016 - 2019 \n and Poisson distribution with lambda = 0.0678") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency")
We can see that the Poisson distribution fits well with the histogram.
Next, we try to estimate \(99\%\) Confidence Interval of \(\lambda\). The variance of Poisson distribution is equal to its mean (\(\lambda\)). Therefore, \(99\%\) Confidence Interval of \(\lambda\) can be written as \[ \bar{x} - z_{*}\cdot\sqrt{\frac{\bar{x}}{n}} \leq \lambda \leq \bar{x} + z_{*}\cdot\sqrt{\frac{\bar{x}}{n}}, \] where \(\bar{x}\) is the sample mean, \(n\) is the sample size, and \(z_*\) is z-value corresponding to the \(99\%\) confidence interval, and the value is 2.58. From this expression, 99% Confidence Interval of \(\lambda\) for DV before COVID-19 is [0.05, 0.0856].
The frequency table of DV after COVID-19 is as follows.
# White-male arrests after COVID-19 (2020-2021).
DF_WM_20_21 <- DF_WM[DF_WM$Arrest.Year %in% 2020:2021, ]
# DV arrests in that period.
DF_WM_20_21_DV <- DF_WM_20_21[DF_WM_20_21$Arrest.Category=='Offenses Against Family & Children', ]
# Number of DV arrests on each date with at least one arrest.
DV_day_20_21 <- vapply(unique(DF_WM_20_21_DV$Arrest.Date),
                       function(d) sum(DF_WM_20_21_DV$Arrest.Date == d),
                       integer(1))
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 680 | 0.9302326 |
| 1 | 47 | 0.0642955 |
| 2 | 1 | 0.001368 |
| 3 | 0 | 0 |
| 4 | 1 | 0.001368 |
| 5 | 0 | 0 |
| … | 0 | 0 |
| 44 | 0 | 0 |
| 45 | 1 | 0.001368 |
| 46 | 0 | 0 |
| … | 0 | 0 |
| 77 | 0 | 0 |
| 78 | 1 | 0.001368 |
| 79 | 0 | 0 |
| … | 0 | 0 |
There are two outliers (45 and 78) in the table. Their dates are 2021/1/6 and 2020/6/1. Since these dates correspond to the “Capitol attack” and the “George Floyd protests”, we will drop the values for these dates.
The calculated \(\lambda = 0.0725\). The histogram and the Poisson distribution with \(\lambda = 0.0725\) are shown in Figure 16.
# Observed counts of days with 0-5 DV arrests in 2020-2021, after
# dropping the two outlier dates (731 days total: 365*2 plus one leap day).
x_DV <- 0:5
y_DV <- c(680,47,1,0,1,0)
# Poisson pmf with lambda = 53 arrests / 731 days (~0.0725).
fx <- dpois(x=x_DV, lambda=53/(365*2+1))
data_DV <- data.frame(x_DV, y_DV, fx)
# Figure 15: raw frequency histogram.
ggplot(data_DV, aes(x=x_DV,y=y_DV)) +
  ggtitle("Figure 15: Histogram of DV in 2020 - 2021") +
  xlab("Number of occurrences per day") +
  ylab("Frequency") +
  geom_bar(stat = "identity")
# Figure 16: relative frequencies with the fitted Poisson pmf overlaid.
# Typos fixed in the displayed title: "Reralive" -> "Relative" and
# "Pission" -> "Poisson".
ggplot(data_DV) +
  ggtitle("Figure 16: Relative frequency histogram of DV in 2020 - 2021 \n and Poisson distribution with lambda = 0.0725") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency") +
  geom_bar(aes(x=x_DV,y=y_DV/(365*2+1)), stat = "identity") +
  geom_line(aes(x=x_DV,y=fx), color='red') +
  geom_point(aes(x=x_DV,y=fx), color='red')
The Poisson distribution fits well with the histogram.
99% Confidence Interval of \(\lambda\) for DV after COVID-19 is [0.0465, 0.0985].
Figure 17 shows the Confidence Intervals before and after COVID-19. There is overlap in the Confidence Intervals, and it is not possible to say that there was a change in the \(\lambda\) of “Offenses Against Family & Children” before or after COVID-19.
# Labels for the two periods.
x <- c('Before COVID-19','After COVID-19')
# 99% CI for lambda: xbar +/- 2.58 * sqrt(xbar / n), where n is the
# number of days in the period.
# BUG FIX: the denominator previously used 356 instead of 365, so n was
# wrong (1425 instead of 1461 days pre-COVID, 713 instead of 731 post).
lambda_pre <- 99 / (365 * 4 + 1)   # 99 DV arrests over 1461 days
n_pre <- 365 * 4 + 1
pre_covid_interval <- c(lambda_pre - 2.58 * sqrt(lambda_pre / n_pre),
                        lambda_pre + 2.58 * sqrt(lambda_pre / n_pre))
lambda_post <- 53 / (365 * 2 + 1)  # 53 DV arrests over 731 days
n_post <- 365 * 2 + 1
post_covid_interval <- c(lambda_post - 2.58 * sqrt(lambda_post / n_post),
                         lambda_post + 2.58 * sqrt(lambda_post / n_post))
data_CI_DV <- data.frame(x,pre_covid_interval,post_covid_interval)
# Figure 17: horizontal 99% CIs for the DV lambda, before vs after
# COVID-19. The linerange endpoints come from the vectors computed above.
ggplot(data_CI_DV) +
  geom_linerange(aes(x = x[1], ymin = pre_covid_interval[1], ymax = pre_covid_interval[2])) +
  geom_linerange(aes(x = x[2], ymin = post_covid_interval[1], ymax = post_covid_interval[2])) +
  coord_flip() +
  ggtitle("Figure 17: 99% Confidence Interval of lambda for DV") +
  xlab("") +
  ylab("99% Confidence Interval of lambda")
The trend of “Traffic Violations” also appears to have changed after COVID-19. The frequency table of Traffic Violations before COVID-19 is as follows.
# Traffic-violation arrests before COVID-19 (2016-2019).
DF_WM_16_19_TV <- DF_WM_16_19[DF_WM_16_19$Arrest.Category=='Traffic Violations', ]
# Number of arrests on each date with at least one arrest.
TV_day_16_19 <- vapply(unique(DF_WM_16_19_TV$Arrest.Date),
                       function(d) sum(DF_WM_16_19_TV$Arrest.Date == d),
                       integer(1))
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 602 | 0.4123288 |
| 1 | 530 | 0.3627652 |
| 2 | 225 | 0.1540041 |
| 3 | 77 | 0.0527036 |
| 4 | 22 | 0.0150582 |
| 5 | 4 | 0.0027379 |
| 6 | 1 | 6.844627^{-4} |
| 7 | 0 | 0 |
The calculated \(\lambda = 0.907\). The histogram and the Poisson distribution with \(\lambda = 0.907\) are shown in Figure 19.
# Observed counts of days with 0-10 traffic-violation arrests in
# 2016-2019 (1461 days total).
x_TV <- 0:10
y_TV <- c(602,530,225,77,22,4,1,0,0,0,0)
# Poisson pmf with lambda = total arrests / 1461 days (~0.907).
fx <- dpois(x=x_TV, lambda=sum(TV_day_16_19)/(365*4+1))
data_TV <- data.frame(x_TV, y_TV, fx)
# Figure 18: raw frequency histogram.
ggplot(data_TV, aes(x=x_TV,y=y_TV)) +
  ggtitle("Figure 18: Histogram of traffic violations in 2016 - 2019") +
  xlab("Number of occurrences per day") +
  ylab("Frequency") +
  geom_bar(stat = "identity")
# Figure 19: relative frequencies with the fitted Poisson pmf overlaid.
# Typo fixed in the displayed title: "Pission" -> "Poisson".
ggplot(data_TV) +
  ggtitle("Figure 19: Relative frequency histogram of traffic violations in 2016 - 2019 \n and Poisson distribution with lambda = 0.907") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency") +
  geom_bar(aes(x=x_TV,y=y_TV/(365*4+1)), stat = "identity") +
  geom_line(aes(x=x_TV,y=fx), color="red") +
  geom_point(aes(x=x_TV,y=fx), color='red')
The Poisson distribution fits well with the histogram.
99% Confidence Interval of \(\lambda\) for Traffic Violations before COVID-19 is [0.842, 0.972].
The frequency table of Traffic Violations after COVID-19 is as follows.
# Traffic-violation arrests after COVID-19 (2020-2021).
DF_WM_20_21_TV <- DF_WM_20_21[DF_WM_20_21$Arrest.Category=='Traffic Violations', ]
# Number of arrests on each date with at least one arrest.
TV_day_20_21 <- vapply(unique(DF_WM_20_21_TV$Arrest.Date),
                       function(d) sum(DF_WM_20_21_TV$Arrest.Date == d),
                       integer(1))
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 545 | 0.745554 |
| 1 | 157 | 0.2147743 |
| 2 | 23 | 0.0314637 |
| 3 | 3 | 0.004104 |
| 4 | 3 | 0.004104 |
| 5 | 0 | 0 |
The calculated \(\lambda = 0.306\). The histogram and the Poisson distribution with \(\lambda = 0.306\) are shown in Figure 21.
# Observed counts of days with 0-10 traffic-violation arrests in
# 2020-2021 (731 days total).
x_TV <- 0:10
y_TV <- c(545,157,23,3,3,0,0,0,0,0,0)
# Poisson pmf with lambda = total arrests / 731 days (~0.306).
fx <- dpois(x=x_TV, lambda=sum(TV_day_20_21)/(365*2+1))
data_TV <- data.frame(x_TV, y_TV, fx)
# Figure 20: raw frequency histogram.
ggplot(data_TV, aes(x=x_TV,y=y_TV)) +
  ggtitle("Figure 20: Histogram of traffic violations in 2020 - 2021") +
  xlab("Number of occurrences per day") +
  ylab("Frequency") +
  geom_bar(stat = "identity")
# Figure 21: relative frequencies with the fitted Poisson pmf overlaid.
# Typo fixed in the displayed title: "Pission" -> "Poisson".
ggplot(data_TV) +
  ggtitle("Figure 21: Relative frequency histogram of traffic violations in 2020 - 2021 \n and Poisson distribution with lambda = 0.306") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency") +
  geom_bar(aes(x=x_TV,y=y_TV/(365*2+1)), stat = "identity") +
  geom_line(aes(x=x_TV,y=fx), color="red") +
  geom_point(aes(x=x_TV,y=fx), color='red')
The Poisson distribution fits well with the histogram.
99% Confidence Interval of \(\lambda\) for Traffic Violations after COVID-19 is [0.253, 0.36].
Figure 22 shows the Confidence Intervals before and after COVID-19. There is no overlap in the Confidence Intervals, and there may have been a change in the Traffic Violations’ lambda before and after COVID-19.
# Labels for the two periods.
x <- c('Before COVID-19','After COVID-19')
# 99% CI for lambda: xbar +/- 2.58 * sqrt(xbar / n), n = days in period.
# BUG FIX: the denominator previously used 356 instead of 365 days/year.
lambda_tv_pre <- sum(TV_day_16_19) / (365 * 4 + 1)
pre_covid_interval <- c(lambda_tv_pre - 2.58 * sqrt(lambda_tv_pre / (365 * 4 + 1)),
                        lambda_tv_pre + 2.58 * sqrt(lambda_tv_pre / (365 * 4 + 1)))
lambda_tv_post <- sum(TV_day_20_21) / (365 * 2 + 1)
post_covid_interval <- c(lambda_tv_post - 2.58 * sqrt(lambda_tv_post / (365 * 2 + 1)),
                         lambda_tv_post + 2.58 * sqrt(lambda_tv_post / (365 * 2 + 1)))
data_CI_TV <- data.frame(x,pre_covid_interval,post_covid_interval)
# Figure 22: horizontal 99% CIs for the traffic-violation lambda,
# before vs after COVID-19.
ggplot(data_CI_TV) +
  geom_linerange(aes(x = x[1], ymin = pre_covid_interval[1], ymax = pre_covid_interval[2])) +
  geom_linerange(aes(x = x[2], ymin = post_covid_interval[1], ymax = post_covid_interval[2])) +
  coord_flip() +
  ggtitle("Figure 22: 99% Confidence Interval of lambda for Traffic Violations") +
  xlab("") +
  ylab("99% Confidence Interval of lambda")
# Simple Assault arrests before COVID-19 (2016-2019).
DF_WM_16_19_SA <- DF_WM_16_19[DF_WM_16_19$Arrest.Category=='Simple Assault', ]
# Number of arrests on each date with at least one arrest.
SA_day_16_19 <- vapply(unique(DF_WM_16_19_SA$Arrest.Date),
                       function(d) sum(DF_WM_16_19_SA$Arrest.Date == d),
                       integer(1))
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 438 | 0.2997947 |
| 1 | 479 | 0.3278576 |
| 2 | 284 | 0.1943874 |
| 3 | 156 | 0.1067762 |
| 4 | 76 | 0.0520192 |
| 5 | 13 | 0.008898 |
| 6 | 8 | 0.0054757 |
| 7 | 4 | 0.0027379 |
| 8 | 1 | 6.844627^{-4} |
| 9 | 2 | 0.0013689 |
| 10 | 0 | 0 |
# Observed counts of days with 0-10 Simple Assault arrests in 2016-2019
# (1461 days total).
x_SA <- 0:10
y_SA <- c(438, 479, 284, 156, 76, 13, 8, 4, 1, 2, 0)
# Poisson pmf with lambda = total arrests / 1461 days (~1.36).
fx <- dpois(x = x_SA, lambda = sum(SA_day_16_19) / (365 * 4 + 1))
data_SA <- data.frame(x_SA, y_SA, fx)
# Figure 23: raw frequency histogram.
ggplot(data_SA, aes(x = x_SA, y = y_SA)) +
  geom_bar(stat = "identity") +
  ggtitle("Figure 23: Histogram of Simple Assault in 2016 - 2019") +
  xlab("Number of occurrences per day") +
  ylab("Frequency")
# Figure 24: relative frequencies with the fitted Poisson pmf overlaid.
ggplot(data_SA) +
  geom_bar(aes(x = x_SA, y = y_SA / (365 * 4 + 1)), stat = "identity") +
  geom_line(aes(x = x_SA, y = fx), color = 'red') +
  geom_point(aes(x = x_SA, y = fx), color = 'red') +
  ggtitle("Figure 24: Relative frequency histogram of Simple Assault in 2016 - 2019 \n and Poisson distribution with lambda = 1.36") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency")
# Simple Assault arrests after COVID-19 (2020-2021).
DF_WM_20_21_SA <- DF_WM_20_21[DF_WM_20_21$Arrest.Category=='Simple Assault', ]
# Number of arrests on each date with at least one arrest.
SA_day_20_21 <- vapply(unique(DF_WM_20_21_SA$Arrest.Date),
                       function(d) sum(DF_WM_20_21_SA$Arrest.Date == d),
                       integer(1))
| # of occurrences per day | Frequency | Relative frequency |
|---|---|---|
| 0 | 297 | 0.4062927 |
| 1 | 267 | 0.3652531 |
| 2 | 112 | 0.1532148 |
| 3 | 43 | 0.0588235 |
| 4 | 8 | 0.0109439 |
| 5 | 3 | 0.004104 |
| 6 | 0 | 0 |
| 7 | 0 | 0 |
| 8 | 1 | 0.001368 |
| 9 | 0 | 0 |
# Observed counts of days with 0-10 Simple Assault arrests in 2020-2021
# (731 days total).
x_SA <- 0:10
y_SA <- c(297, 267, 112, 43, 8, 3, 0, 0, 1, 0, 0)
# Poisson pmf with lambda = total arrests / 731 days (~0.923).
fx <- dpois(x = x_SA, lambda = sum(SA_day_20_21) / (365 * 2 + 1))
data_SA <- data.frame(x_SA, y_SA, fx)
# Raw frequency histogram.
ggplot(data_SA, aes(x = x_SA, y = y_SA)) +
  geom_bar(stat = "identity") +
  ggtitle("Figure 25: Histogram of Simple Assault in 2020 - 2021") +
  xlab("Number of occurrences per day") +
  ylab("Frequency")
# Relative frequencies with the fitted Poisson pmf overlaid.
# NOTE(review): this title repeats "Figure 25" — it likely should carry
# the next number in the figure sequence; confirm before renumbering.
ggplot(data_SA) +
  geom_bar(aes(x = x_SA, y = y_SA / (365 * 2 + 1)), stat = "identity") +
  geom_line(aes(x = x_SA, y = fx), color = 'red') +
  geom_point(aes(x = x_SA, y = fx), color = 'red') +
  ggtitle("Figure 25: Relative frequency histogram of Simple Assault in 2020 - 2021 \n and Poisson distribution with lambda = 0.923") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency")
# Labels for the two periods.
x <- c('Before COVID-19','After COVID-19')
# 99% CI for lambda: xbar +/- 2.58 * sqrt(xbar / n), n = days in period.
# BUG FIX: the denominator previously used 356 instead of 365 days/year.
lambda_sa_pre <- sum(SA_day_16_19) / (365 * 4 + 1)
pre_covid_interval <- c(lambda_sa_pre - 2.58 * sqrt(lambda_sa_pre / (365 * 4 + 1)),
                        lambda_sa_pre + 2.58 * sqrt(lambda_sa_pre / (365 * 4 + 1)))
lambda_sa_post <- sum(SA_day_20_21) / (365 * 2 + 1)
post_covid_interval <- c(lambda_sa_post - 2.58 * sqrt(lambda_sa_post / (365 * 2 + 1)),
                         lambda_sa_post + 2.58 * sqrt(lambda_sa_post / (365 * 2 + 1)))
data_CI_SA <- data.frame(x,pre_covid_interval,post_covid_interval)
# Figure 26: horizontal 99% CIs for the Simple Assault lambda,
# before vs after COVID-19.
ggplot(data_CI_SA) +
  geom_linerange(aes(x = x[1], ymin = pre_covid_interval[1], ymax = pre_covid_interval[2])) +
  geom_linerange(aes(x = x[2], ymin = post_covid_interval[1], ymax = post_covid_interval[2])) +
  coord_flip() +
  ggtitle("Figure 26: 99% Confidence Interval of lambda for Simple Assault") +
  xlab("") +
  ylab("99% Confidence Interval of lambda")
# Count Simple Assault arrests per date within one pre-COVID year.
# Returns an integer vector with one entry per date that had >= 1
# arrest. Replaces four copy-pasted sapply chunks.
sa_daily_counts_pre <- function(yr) {
  dates <- DF_WM_16_19_SA[DF_WM_16_19_SA$Arrest.Year == yr, ]$Arrest.Date
  vapply(unique(dates), function(d) sum(dates == d), integer(1))
}
SA_day_16 <- sa_daily_counts_pre(2016)
SA_day_17 <- sa_daily_counts_pre(2017)
SA_day_18 <- sa_daily_counts_pre(2018)
SA_day_19 <- sa_daily_counts_pre(2019)
# 2016 (leap year: 366 days)
x_SA <- 0:10
# Days with zero arrests = 366 minus the dates present in SA_day_16;
# tabulate() counts days with exactly 1..10 arrests, replacing the ten
# length(SA_day_16[SA_day_16 == k]) expressions.
y_SA <- c(365+1-length(SA_day_16), tabulate(SA_day_16, nbins = 10))
fx <- dpois(x=x_SA, lambda=sum(SA_day_16)/(365+1))
data_SA <- data.frame(x_SA, y_SA, fx)
ggplot(data_SA) +
  geom_bar(aes(x=x_SA,y=y_SA/(365+1)), stat = "identity") +
  geom_line(aes(x=x_SA,y=fx), color='red') +
  geom_point(aes(x=x_SA,y=fx), color='red') +
  ggtitle("Figure: Relative frequency histogram of Simple Assault in 2016
\n and Poisson distribution with lambda = 1.5") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency")
# 2017 (365 days)
x_SA <- 0:10
# Zero-arrest days = 365 minus dates present; tabulate() counts 1..10.
y_SA <- c(365-length(SA_day_17), tabulate(SA_day_17, nbins = 10))
fx <- dpois(x=x_SA, lambda=sum(SA_day_17)/365)
data_SA <- data.frame(x_SA, y_SA, fx)
ggplot(data_SA) +
  geom_bar(aes(x=x_SA,y=y_SA/365), stat = "identity") +
  geom_line(aes(x=x_SA,y=fx), color='red') +
  geom_point(aes(x=x_SA,y=fx), color='red') +
  ggtitle("Figure: Relative frequency histogram of Simple Assault in 2017
\n and Poisson distribution with lambda = 1.33") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency")
# 2018 (365 days)
x_SA <- 0:10
# Zero-arrest days = 365 minus dates present; tabulate() counts 1..10.
y_SA <- c(365-length(SA_day_18), tabulate(SA_day_18, nbins = 10))
fx <- dpois(x=x_SA, lambda=sum(SA_day_18)/365)
data_SA <- data.frame(x_SA, y_SA, fx)
ggplot(data_SA) +
  geom_bar(aes(x=x_SA,y=y_SA/365), stat = "identity") +
  geom_line(aes(x=x_SA,y=fx), color='red') +
  geom_point(aes(x=x_SA,y=fx), color='red') +
  ggtitle("Figure: Relative frequency histogram of Simple Assault in 2018
\n and Poisson distribution with lambda = 1.32") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency")
# 2019 (365 days)
x_SA <- 0:10
# Zero-arrest days = 365 minus dates present; tabulate() counts 1..10.
y_SA <- c(365-length(SA_day_19), tabulate(SA_day_19, nbins = 10))
fx <- dpois(x=x_SA, lambda=sum(SA_day_19)/365)
data_SA <- data.frame(x_SA, y_SA, fx)
ggplot(data_SA) +
  geom_bar(aes(x=x_SA,y=y_SA/365), stat = "identity") +
  geom_line(aes(x=x_SA,y=fx), color='red') +
  geom_point(aes(x=x_SA,y=fx), color='red') +
  ggtitle("Figure: Relative frequency histogram of Simple Assault in 2019
\n and Poisson distribution with lambda = 1.3") +
  xlab("Number of occurrences per day") +
  ylab("Relative frequency")
# Count Simple Assault arrests per date within one post-COVID year.
# Returns an integer vector with one entry per date that had >= 1
# arrest. Replaces two copy-pasted sapply chunks.
sa_daily_counts_post <- function(yr) {
  dates <- DF_WM_20_21_SA[DF_WM_20_21_SA$Arrest.Year == yr, ]$Arrest.Date
  vapply(unique(dates), function(d) sum(dates == d), integer(1))
}
SA_day_20 <- sa_daily_counts_post(2020)
SA_day_21 <- sa_daily_counts_post(2021)
# 2020: relative-frequency histogram of daily Simple Assault arrest counts,
# overlaid with the Poisson pmf fitted by lambda = total arrests / days.
# 2020 is a leap year, so there are 366 days.
n_days <- 366
lambda_SA <- sum(SA_day_20) / n_days
x_SA <- 0:10
# Number of days with exactly k arrests, k = 0..10.  SA_day_20 only lists
# dates with at least one arrest, so the zero-count is n_days minus its length.
y_SA <- c(n_days - length(SA_day_20),
          vapply(1:10, function(k) sum(SA_day_20 == k), numeric(1)))
fx <- dpois(x = x_SA, lambda = lambda_SA)
data_SA <- data.frame(x_SA, y_SA, fx)
ggplot(data_SA) +
ggtitle("Figure: Relative frequency histogram of Simple Assault in 2020
\n and Poisson distribution with lambda = 0.954") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_SA, y=y_SA/n_days), stat = "identity") +
geom_line(aes(x=x_SA, y=fx), color='red') +
geom_point(aes(x=x_SA, y=fx), color='red')
# 2021: relative-frequency histogram of daily Simple Assault arrest counts,
# overlaid with the Poisson pmf fitted by lambda = total arrests / days.
# 2021 is not a leap year, so there are 365 days.
n_days <- 365
lambda_SA <- sum(SA_day_21) / n_days
x_SA <- 0:10
# Number of days with exactly k arrests, k = 0..10.  SA_day_21 only lists
# dates with at least one arrest, so the zero-count is n_days minus its length.
y_SA <- c(n_days - length(SA_day_21),
          vapply(1:10, function(k) sum(SA_day_21 == k), numeric(1)))
fx <- dpois(x = x_SA, lambda = lambda_SA)
data_SA <- data.frame(x_SA, y_SA, fx)
ggplot(data_SA) +
ggtitle("Figure: Relative frequency histogram of Simple Assault in 2021
\n and Poisson distribution with lambda = 0.893") +
xlab("Number of occurrences per day") +
ylab("Relative frequency") +
geom_bar(aes(x=x_SA, y=y_SA/n_days), stat = "identity") +
geom_line(aes(x=x_SA, y=fx), color='red') +
geom_point(aes(x=x_SA, y=fx), color='red')
# 99% confidence interval for the Poisson rate (lambda = arrests per day)
# of Simple Assault, one interval per year.  For a Poisson sample the MLE is
# lambda_hat = total count / n_days with standard error sqrt(lambda_hat / n_days);
# 2.58 is the two-sided 99% normal quantile.
# FIX: the original's standard-error denominator used 356 (and 356+1) instead
# of the actual number of days, 365 (366 for leap years) — a typo repeated
# across all six years.
ci99 <- function(day_counts, n_days) {
  lambda_hat <- sum(day_counts) / n_days
  margin <- 2.58 * sqrt(lambda_hat / n_days)
  c(lambda_hat - margin, lambda_hat + margin)
}
x <- c('2016','2017','2018','2019','2020','2021')
interval_2016 <- ci99(SA_day_16, 366)  # leap year
interval_2017 <- ci99(SA_day_17, 365)
interval_2018 <- ci99(SA_day_18, 365)
interval_2019 <- ci99(SA_day_19, 365)
interval_2020 <- ci99(SA_day_20, 366)  # leap year
interval_2021 <- ci99(SA_day_21, 365)
data_CI_SA <- data.frame(x, interval_2016, interval_2017, interval_2018,
                         interval_2019, interval_2020, interval_2021)
# One horizontal range per year (coord_flip turns x into the vertical axis).
ggplot(data_CI_SA) +
ggtitle("Figure: 99% Confidence Interval of lambda for Simple Assault") +
ylab("99% Confidence Interval of lambda") +
xlab("") +
coord_flip() +
geom_linerange(aes(x=x[1], ymin=interval_2016[1], ymax=interval_2016[2])) +
geom_linerange(aes(x=x[2], ymin=interval_2017[1], ymax=interval_2017[2])) +
geom_linerange(aes(x=x[3], ymin=interval_2018[1], ymax=interval_2018[2])) +
geom_linerange(aes(x=x[4], ymin=interval_2019[1], ymax=interval_2019[2])) +
geom_linerange(aes(x=x[5], ymin=interval_2020[1], ymax=interval_2020[2])) +
geom_linerange(aes(x=x[6], ymin=interval_2021[1], ymax=interval_2021[2]))
# Estimated daily rate (lambda-hat) of Simple Assault arrests per year:
# total arrests divided by days in the year (2016 and 2020 are leap years).
mean16 <- sum(SA_day_16) / 366
mean17 <- sum(SA_day_17) / 365
mean18 <- sum(SA_day_18) / 365
mean19 <- sum(SA_day_19) / 365
mean20 <- sum(SA_day_20) / 366
mean21 <- sum(SA_day_21) / 365
# Sample variance of the daily Simple Assault counts for one year.
#   day_counts: per-day counts for the dates with at least one arrest;
#               dates absent from the vector contribute zeros.
#   n_days:     days in the year (366 for leap years).
# NOTE: as in the original, days with 10 or more arrests are left out of the
# weighted sum (terms run over k = 0..9 only).
# FIXES vs. original: (1) var20 indexed SA_day_16 instead of SA_day_20 in
# three of its terms; (2) var18, var19 and var21 used 366 zero-days even
# though 2018, 2019 and 2021 are not leap years.
daily_variance <- function(day_counts, n_days) {
  mu <- sum(day_counts) / n_days          # same value as the meanXX variables
  freq <- c(n_days - length(day_counts),  # days with zero arrests
            vapply(1:9, function(k) sum(day_counts == k), numeric(1)))
  sum(((0:9) - mu)^2 * freq) / (n_days - 1)
}
var16 <- daily_variance(SA_day_16, 366)  # leap year
var17 <- daily_variance(SA_day_17, 365)
var18 <- daily_variance(SA_day_18, 365)
var19 <- daily_variance(SA_day_19, 365)
var20 <- daily_variance(SA_day_20, 366)  # leap year
var21 <- daily_variance(SA_day_21, 365)
# Report lambda-hat and the sample variance for each year; under a Poisson
# model the two should be close for every year.
print("The mean and variance of SA in 2016 are ")
print(mean16)
print(var16)
print("The mean and variance of SA in 2017 are ")
print(mean17)
print(var17)
print("The mean and variance of SA in 2018 are ")
print(mean18)
print(var18)
print("The mean and variance of SA in 2019 are ")
print(mean19)
print(var19)
print("The mean and variance of SA in 2020 are ")
print(mean20)
print(var20)
print("The mean and variance of SA in 2021 are ")
print(mean21)
print(var21)